Load FDIC data and Fannie Mae data
Fannie Mae data load uses a custom function that selects specific columns and only loads 1/3rd at a time to comply with 16 GB of RAM. It then collapses the data on acquisitions.
# Core Packages
import pandas as pd
import numpy as np
import random
import glob
import pickle
import zipfile
"""
Grabs the entire Federal Deposit Insurance Corporation (FDIC) Statistics on
Depository Institutions (SDI) data set.
Note that this is a large data set! There are roughly 85 zip files each of
which is between 40 and 84 MB.
"""
import pandas as pd
import requests
fld = '..\Data\FDIC\\'
base_url = 'https://www7.fdic.gov/sdi/Resource/AllReps/All_Reports_'
# use pandas to construct a list of quarterly dates
present = '20081231'
datetimes = pd.date_range('19930331', end=present, freq='Q')
dates = datetimes.format(formatter=lambda t: t.strftime('%Y%m%d'))
for date in dates:
    print(date)
    # ...construct the url...
    tmp_url = base_url + date + '.zip'
    # ...make the connection and grab the zipped files...
    tmp_buffer = requests.get(tmp_url)
    # ...save them to disk...
    with open(fld + 'All_Reports_' + date + '.zip', 'wb') as tmp_zip_file:
        tmp_zip_file.write(tmp_buffer.content)
    print('Done with files for ' + date + '!')
 def data_load_save(fld, file):
        
    tmp_buffer = zipfile.ZipFile(str(fld + file + '.zip'))
    tmp_file = tmp_buffer.namelist()[0]
    full_cols = ["Reference Pool ID","Loan Identifier","Monthly Reporting Period","Channel","Seller Name","Servicer Name","Master Servicer","Original Interest Rate","Current Interest Rate","Original UPB","UPB at Issuance","Current Actual UPB","Original Loan Term","Origination Date","First Payment Date","Loan Age","Remaining Months to Legal Maturity","Remaining Months To Maturity","Maturity Date","Original Loan to Value Ratio (LTV)","Original Combined Loan to Value Ratio (CLTV)","Number of Borrowers","Debt-To-Income (DTI)","Borrower Credit Score at Origination","Co-Borrower Credit Score at Origination","First Time Home Buyer Indicator","Loan Purpose","Property Type","Number of Units","Occupancy Status","Property State","Metropolitan Statistical Area (MSA)","Zip Code Short","Mortgage Insurance Percentage","Amortization Type","Prepayment Penalty Indicator","Interest Only Loan Indicator","Interest Only First Principal And Interest Payment Date","Months to Amortization","Current Loan Delinquency Status","Loan Payment History","Modification Flag","Mortgage Insurance Cancellation Indicator","Zero Balance Code","Zero Balance Effective Date","UPB at the Time of Removal","Repurchase Date","Scheduled Principal Current","Total Principal Current","Unscheduled Principal Current","Last Paid Installment Date",
                 "Foreclosure Date","Disposition Date","Foreclosure Costs","Property Preservation and Repair Costs","Asset Recovery Costs","Miscellaneous Holding Expenses and Credits","Associated Taxes for Holding Property","Net Sales Proceeds","Credit Enhancement Proceeds","Repurchase Make Whole Proceeds","Other Foreclosure Proceeds","Non-Interest Bearing UPB","Principal Forgiveness Amount","Original List Start Date","Original List Price","Current List Start Date","Current List Price","Borrower Credit Score At Issuance","Co-Borrower Credit Score At Issuance","Borrower Credit Score Current","Co-Borrower Credit Score Current","Mortgage Insurance Type","Servicing Activity Indicator","Current Period Modification Loss Amount","Cumulative Modification Loss Amount","Current Period Credit Event Net Gain or Loss","Cumulative Credit Event Net Gain or Loss","HomeReady® Program Indicator","Foreclosure Principal Write-off Amount","Relocation Mortgage Indicator","Zero Balance Code Change Date","Loan Holdback Indicator","Loan Holdback Effective Date","Delinquent Accrued Interest","Property Valuation Method","High Balance Loan Indicator","ARM Initial Fixed-Rate Period ≤ 5 YR Indicator","ARM Product Type","Initial Fixed-Rate Period","Interest Rate Adjustment Frequency","Next Interest Rate Adjustment Date",
                 "Next Payment Change Date","Index","ARM Cap Structure","Initial Interest Rate Cap Up Percent","Periodic Interest Rate Cap Up Percent","Lifetime Interest Rate Cap Up Percent","Mortgage Margin","ARM Balloon Indicator","ARM Plan Number","Borrower Assistance Plan","High Loan to Value (HLTV) Refinance Option Indicator","Deal Name","Repurchase Make Whole Proceeds Flag","Alternative Delinquency Resolution","Alternative Delinquency Resolution Count","Total Deferral Amount"]
    cols_1 = ["Loan Identifier","Monthly Reporting Period","Channel","Seller Name",
              "Servicer Name","Master Servicer","Original Interest Rate",
              "Original UPB","Original Loan Term","Origination Date"]
    cols_2 = ["Loan Identifier","Monthly Reporting Period",
              "First Payment Date",
              "Original Loan to Value Ratio (LTV)","Original Combined Loan to Value Ratio (CLTV)",
              "Number of Borrowers","Debt-To-Income (DTI)","Borrower Credit Score at Origination"]
    cols_3 = ["Loan Identifier","Monthly Reporting Period","Co-Borrower Credit Score at Origination",
              "First Time Home Buyer Indicator","Loan Purpose","Property Type","Number of Units",
              "Occupancy Status","Property State","Zip Code Short","Mortgage Insurance Percentage",
              "Zero Balance Code","Mortgage Insurance Type","Relocation Mortgage Indicator"]
        
    # First columns subset
    usecols_1 = list()
    for col in cols_1:
        index = full_cols.index(col)
        usecols_1.append(index)      
    # Second columns subset
    usecols_2 = list()
    for col in cols_2:
        index = full_cols.index(col)
        usecols_2.append(index) 
    # Third columns subset
    usecols_3 = list()
    for col in cols_3:
        index = full_cols.index(col)
        usecols_3.append(index) 
        
    # First columns
    df1 = pd.read_csv(tmp_buffer.open(tmp_file), sep='|', header=None, usecols=usecols_1)
    df1.columns = cols_1
    df1.columns = ["Loan ID","Reported Period","Origination Channel","Lender",
                   "Servicer","Master Servicer","Original Interest Rate","Original Mortgage Amount",
                   "Original Loan Term","Original Date"]
    # Drop duplicates
    df1 = df1.drop_duplicates(subset='Loan ID', keep='last')
    
    # Second columns
    df2 = pd.read_csv(tmp_buffer.open(tmp_file), sep='|', header=None, usecols=usecols_2)
    df2.columns = cols_2
    df2.columns = ["Loan ID","Reported Period","First Payment",
                   "Original Loan-to-Value (LTV)","Original Combined Loan-to-Value (CLTV)",
                   "Number of Borrowers","Original Debt to Income Ratio","Credit Score"]
    
    # Drop duplicates
    df2 = df2.drop_duplicates(subset='Loan ID', keep='last')
    df2 = df2.drop(columns=["Reported Period", 'Loan ID'])
    
    # Second columns
    df3 = pd.read_csv(tmp_buffer.open(tmp_file), sep='|', header=None, usecols=usecols_3)
    df3.columns = cols_3
    df3.columns = ["Loan ID","Reported Period","Co-Borrower Credit Score",
                   "First Time Home Buyer","Loan Purpose","Property Type","Number of Units",
                   "Occupancy Type","Property State","Zip Code","Mortgage Insurance %",
                   "Zero Balance Code","Mortgage Insurance Type","Relocation Mortgage Indicator"]
    # Foreclosures
    df3['Zero Balance Code'] = df3['Zero Balance Code'].apply(lambda x: 1 if ((x == 2) | 
                                                                              (x == 3) |
                                                                              (x == 6) | 
                                                                              (x == 9) | 
                                                                              (x == 15)) else 0)
    Foreclosed = pd.DataFrame()
    Foreclosed = df3[['Loan ID', 'Zero Balance Code']].groupby(['Loan ID']).sum().reset_index()
    print(Foreclosed['Zero Balance Code'].value_counts())
    # Drop duplicates   
    df3 = df3.drop_duplicates(subset='Loan ID', keep='last')
    df3 = df3.drop(columns=["Reported Period", 'Loan ID'])
    
    # Combine loaded data
    df = pd.concat([df1, df2, df3], axis=1)
    print("Subsetted shape:", df.shape)
    print("")
    
    # Set Foreclosed to binary
    df = df.reset_index()
    df['Foreclosed'] = Foreclosed['Zero Balance Code'].apply(lambda x: 1 if x >= 1 else 0)
    df = df.iloc[:,1:len(df)]
    df.to_csv(str(fld + 'Acq_' + file + '.csv'), index=False) 
    
    # return(df)
fld = '../Data/'
files = ['2001Q1','2001Q2','2001Q3','2001Q4','2002Q1','2002Q2','2002Q3','2002Q4',
         '2003Q1','2003Q2','2003Q3','2003Q4','2004Q1','2004Q2','2004Q3','2004Q4',
         '2005Q1','2005Q2','2005Q3','2005Q4','2006Q1','2006Q2','2006Q3','2006Q4',
         '2007Q1','2007Q2','2007Q3','2007Q4','2008Q1','2008Q2','2008Q3','2008Q4']
for file in files:
    print('Data Load for', file)
    data_load_save(fld, file)
    print('')